Setup

load libraries and define functions

library(DESeq2) 
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(EnhancedVolcano)
## Loading required package: ggplot2
## Loading required package: ggrepel
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(AnnotationDbi)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(msigdbr)
library(clusterProfiler)
## 
## clusterProfiler v4.2.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
library(scales)
library(ggpubr)
library(ggplot2)
library(enrichplot)
## 
## Attaching package: 'enrichplot'
## The following object is masked from 'package:ggpubr':
## 
##     color_palette
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(pheatmap)
library(org.Hs.eg.db)
## 
library(xCell)

# function to read experiment designs from our datasets
read_design <- function(filename) {
  E <- read.csv(filename, sep = "\t")

  rownames(E) <- E$Run
  
  E <- E[c("Sample.Characteristic.disease.", "Sample.Characteristic.organism.part.")]
  colnames(E) <- c("Diagnosis", "Brodmann")

  E[E == "bipolar disorder"] = "bipolar"
  E$`Brodmann` <- sub("Brodmann \\(1909\\) area ", "", E$`Brodmann`)
  
  E[E == "dorsolateral prefrontal cortex"] = "46"
  
  return(E)
}

Load Data

# read the experiments' designs
E7 <- read_design("../Databases/Bipolar - E-GEOD-78936/E-GEOD-78936-experiment-design.tsv")
E5 <- read_design("../Databases/Bipolar - E-GEOD-53239/E-GEOD-53239-experiment-design.tsv")

# add identifiers for each dataset
E5 <- cbind(rep("GEOD-53239", times=nrow(E5)), E5)
E7 <- cbind(rep("GEOD-78936", times=nrow(E7)), E7)

colnames(E5)[1] = colnames(E7)[1] = "Dataset"

# combine metadata
metadata <- rbind(E5, E7)

# read counts data
data1 <- read.delim("../Databases/Bipolar - E-GEOD-53239/E-GEOD-53239-raw-counts.tsv")
data2 <- read.delim("../Databases/Bipolar - E-GEOD-78936/E-GEOD-78936-raw-counts.tsv")

# combine only the counts data
counts <- cbind(data1[-1][-1], data2[-1][-1])
counts <- counts[sort(colnames(counts))]

# take the gene names from the counts data
genes.symbols <- data1[,2]

# if there is no gene name, use the ID
genes.symbols[genes.symbols == ""] <- data1$Gene.ID[genes.symbols == ""]

# replace dupliactes with ID too
genes.symbols[duplicated(genes.symbols)] <- data1$Gene.ID[duplicated(genes.symbols)]

Identifying Biomarker Genes

Run DESeq2

# run DESeq2 on combined data from both experiments
# using Dataset to distinguish between them which will remove the batch effect. 
# Since Dataset is a division contained in Brodmann, using Brodmann is enough
dds <- DESeqDataSetFromMatrix(countData=counts, colData=metadata, design= ~Brodmann + Diagnosis)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 18 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# run different DESeq2 for each brain area
areas <- c(9, 11, 24, 46)
areaDds <- list()
i <- 1
for (i in seq_along(areas)){
  areaDds[[i]] <- DESeqDataSetFromMatrix(countData=counts[,metadata$Brodmann == areas[i]], colData=metadata[metadata$Brodmann == areas[i],], design= ~Diagnosis)
  areaDds[[i]] <- DESeq(areaDds[[i]])
}
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 92 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 45 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 15 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 54 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

Analyze the data:

# do logfold change for Diagnosis
# normal vs bipolar
resLFC.bipolar.normal <- lfcShrink(dds, contrast = c("Diagnosis", "bipolar", "normal"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
resLFC.bipolar.normal["symbol"] <- genes.symbols
resOrdered.bipolar.normal <- resLFC.bipolar.normal[order(resLFC.bipolar.normal$pvalue),]

# bipolar vs schizophrenia
resLFC.bipolar.schizo <- lfcShrink(dds, contrast = c("Diagnosis", "bipolar", "schizophrenia"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
resLFC.bipolar.schizo["symbol"] <- genes.symbols
resOrdered.bipolar.schizo <- resLFC.bipolar.schizo[order(resLFC.bipolar.schizo$pvalue),]

# differentiate brain areas
resLFC.areas <- list()
resOrdered.areas <- list()
comparisons <- list()

j <- 1

for (i in seq_along(areas)){
  controls = c("normal", "schizophrenia")
  if (areas[i] == 46){
    controls = controls[1]
  }
  
  for (control in controls){
    comparison <- paste0("bipolar vs ", control, " in ", areas[i])
    print(paste0("Calculating ", comparison))
    resLFC.areas[[j]] <- lfcShrink(areaDds[[i]], contrast = c("Diagnosis", "bipolar", control), type = "ashr")
    resLFC.areas[[j]]["symbol"] <- genes.symbols
    resOrdered.areas[[j]] <- resLFC.areas[[j]][order(resLFC.areas[[j]]$pvalue),]
    comparisons[[j]] <- comparison
    j = j + 1
  }
}
## [1] "Calculating bipolar vs normal in 9"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs schizophrenia in 9"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs normal in 11"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs schizophrenia in 11"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs normal in 24"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs schizophrenia in 24"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs normal in 46"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
# normalize our data
dds.norm <- vst(dds, blind=F)
rownames(dds.norm) <- genes.symbols

# remove batch effect for PCA
mat <- assay(dds.norm)
mat <- removeBatchEffect(mat, dds.norm$Dataset)
assay(dds.norm) <- mat

Make some nice looking graphs :)

# create a volcano plot
EnhancedVolcano(resOrdered.bipolar.normal, lab = resOrdered.bipolar.normal$symbol, x = 'log2FoldChange', y = 'padj', FCcutoff=2, pCutoff = 10e-10, title="bipolar vs normal")

EnhancedVolcano(resOrdered.bipolar.schizo, lab = resOrdered.bipolar.schizo$symbol, x = 'log2FoldChange', y = 'padj', FCcutoff=2, pCutoff = 10e-10, title="bipolar vs schizophrenia")

for (i in seq_along(comparisons)){
  print(EnhancedVolcano(resOrdered.areas[[i]], lab = resOrdered.areas[[i]]$symbol, x = 'log2FoldChange', y = 'padj', FCcutoff=2, pCutoff = 10e-10, title=comparisons[i]))
}
## Warning: Removed 2 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_hline).

## Warning: Removed 2 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).

## Warning: Removed 2 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).

## Warning: Removed 2 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).

## Warning: Removed 1 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).

## Warning: Removed 1 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).

# make some pca plots
plotPCA(dds.norm, intgroup=c("Diagnosis"))

plotPCA(dds.norm, intgroup=c("Brodmann"))

plotPCA(dds.norm, intgroup=c("Diagnosis", "Brodmann"))

Some count plots for significant genes :)

rownames(dds) <- genes.symbols

genes <- c("LINC02340", "MTND6P4", "MT1X", "IL1RL1")

means_comp <- list(c("bipolar", "normal"), c("bipolar", "schizophrenia"), c("normal", "schizophrenia"))

for (gene in genes) {
  gene.index <- which(genes.symbols %in% gene)
  plot <- plotCounts(dds, gene = rownames(dds)[gene.index], intgroup = c('Diagnosis'), xlab="Diagnosis", title=paste0(gene, " counts"), returnData = T)
  print(
    ggplot(plot, aes(x = Diagnosis, y = count, fill=Diagnosis)) + 
      geom_violin(draw_quantiles = 0.5) + 
      geom_jitter(color="royalblue", size=0.6, alpha=0.5) + 
      theme(legend.position="none")  + 
      ggtitle(paste0("Gene Expression of ", gene)) +
      scale_y_continuous(trans = log10_trans()) + 
      stat_compare_means(label = "p.signif", comparisons = means_comp,  method = "wilcox.test")
    )
}

areaSpecific <- data.frame(GeneName = c("IGKC", "CHI3L2", "MT1X", "MTND6P4"), Brodmann = c(46, 46, 46, 46))

for (geneIndex in 1:nrow(areaSpecific)) {
  area <- areaSpecific[geneIndex,]$Brodmann
  gene <- areaSpecific[geneIndex,]$GeneName
  thisDds = areaDds[[which(areas == area)]]
  rownames(thisDds) <- genes.symbols
  plot <- plotCounts(thisDds, gene = gene, intgroup = c('Diagnosis'), xlab="Diagnosis", title=paste0(gene, " counts"), returnData = T)
  
  comp = means_comp
  if (area == 46){
    comp = means_comp[1]
  }
  
  print(
    ggplot(plot, aes(x = Diagnosis, y = count, fill=Diagnosis)) + 
      scale_fill_manual(values=c("#f8766d", "#00ba38", "#619cff")) +
      geom_violin(draw_quantiles = 0.5) + 
      geom_jitter(color="royalblue", size=0.6, alpha=0.5) + 
      theme(legend.position="none")  + 
      ggtitle(paste0("Gene Expression of ", gene, " in ", area)) + 
      scale_y_continuous(trans = log10_trans()) + 
      stat_compare_means(label = "p.signif", comparisons = comp, method = "wilcox.test")
    )
}
## Warning in wilcox.test.default(c(0.277745867479051, -0.301029995663981, : cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(c(-0.301029995663981, 0.359390413188311, : cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(c(0.517353315321231, 0.359390413188311,
## 0.314141856807374, : cannot compute exact p-value with ties

Identifying Enriched Pathways

And Pathway analysis :)

# remove NAs and create sorted genes list with LFC values and ENSMBLE ID as names
DE_genes_entrez_rank <- resOrdered.bipolar.normal[!is.na(resOrdered.bipolar.normal$padj),]
DE_genes_list <- DE_genes_entrez_rank$log2FoldChange
names(DE_genes_list) <- DE_genes_entrez_rank$symbol
DE_genes_list <- sort(DE_genes_list, decreasing=TRUE)

# collect hallmarks
hallmarks <- msigdbr(species = "Homo sapiens", category = "H") %>% dplyr::select(gs_name, gene_symbol)

# set nPerm to million so it's absolutely precise and does not change when the random seed changes
hm <- GSEA(DE_genes_list, TERM2GENE=hallmarks, nPerm=1000000)
## preparing geneSet collections...
## GSEA analysis...
## Warning in .GSEA(geneList = geneList, exponent = exponent, minGSSize =
## minGSSize, : We do not recommend using nPerm parameter incurrent and future
## releases
## Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize
## = minGSSize, : You are trying to run fgseaSimple. It is recommended to use
## fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument
## in the fgsea function call.
## leading edge analysis...
## done...
# create a ridgeplot
ridgeplot(hm) + labs(x = "enrichment distribution")
## Picking joint bandwidth of 0.0798

# and a dot plot
dotplot(hm, showCategory = 178, title="Pathway Analisys with GSEA, normal VS bipolar")

# We tried running the same code on resOrdered.bipolar.schizo, but it found no enriched terms

Clustering (not checked yet)

# get normalized counts only for bipolar
normcounts <- assay(vst(dds[,dds$Diagnosis == "bipolar"], blind=T))
colnames(normcounts) <- paste0("bipolar", seq(1, ncol(normcounts)))
var_per_gene <- apply(normcounts, 1, var)

# take 1000 most significant genes
selectedGenes <- names(var_per_gene[order(var_per_gene, decreasing = T)][1:1000])
normcounts.top1Kvar <- t(normcounts[selectedGenes,])

# make an elbow plot (optimal k seems to be 2 or 3)
fviz_nbclust(normcounts.top1Kvar, FUN = hcut, method = "wss")

dist_mat <- dist(normcounts.top1Kvar, method = 'euclidean')

# plot the cluster dendrogram
hclust_avg <- hclust(dist_mat, method = 'average')
plot(hclust_avg, cex = 0.6, hang = -1)
rect.hclust(hclust_avg, k = 3, border = 2:8)

Heatmap cause we can

# take normalized counts for normal and bipolar
normcounts <- assay(vst(dds[,dds$Diagnosis != "schizophrenia"], blind=T))

dds.symbol <- dds
rownames(dds.symbol) <- mapIds(org.Hs.eg.db,
                              keys=data1$Gene.ID,
                              column="SYMBOL",
                              keytype="ENSEMBL",
                              multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
rownames(dds.symbol)[is.na(rownames(dds.symbol))] <- rownames(dds)[is.na(rownames(dds.symbol))]
rownames(dds.symbol) <- make.unique(rownames(dds.symbol))

df <- data.frame(row.names = colnames(dds.symbol),
                 diagnosis = colData(dds.symbol)$Diagnosis)

# Take top 10 genes with the lowest p-value that express in Bipolar (log2FoldChange>0)
selectUp <- resOrdered.bipolar.normal$symbol[resOrdered.bipolar.normal$log2FoldChange>0][1:20]
# Take top 10 genes with the lowest p-value that express in Healthy (log2FoldChange<0)
selectDown <- resOrdered.bipolar.normal$symbol[resOrdered.bipolar.normal$log2FoldChange<0][1:20]

selected <- c(selectUp,selectDown)

# create the heatmap
pheatmap(normcounts[selected,], cluster_rows=TRUE,
         show_colnames = FALSE, cluster_cols=TRUE, 
         annotation_col=df, scale = 'row', cutree_cols = 2, cutree_rows = 2)

Examining the Changes in the Cellular Composition

xCell

rownames(counts) <- genes.symbols

counts.xCell <- xCellAnalysis(counts[,metadata$Diagnosis == "bipolar"])
## [1] "Num. of genes: 10535"
## Setting parallel calculations through a MulticoreParam back-end
## with workers=4 and tasks=100.
## Estimating ssGSEA scores for 489 gene sets.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  38%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |============================================                          |  62%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |======================================================================| 100%
p <- pheatmap(counts.xCell, show_colnames = F)

counts.xCell.filtered <- counts.xCell[rowMeans(counts.xCell) > 0.01,]
counts.xCell.filtered.scaled <- t(scale(t(counts.xCell.filtered)))
counts.xCell.filtered.scaled[counts.xCell.filtered.scaled>1] <- 1
counts.xCell.filtered.scaled[counts.xCell.filtered.scaled< -1] <- -1
pheatmap(counts.xCell.filtered.scaled, show_colnames = F)

k = 2
myclusters <- cutree(p$tree_col, k=k)

library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
tabyl(metadata[myclusters == 2,], Diagnosis)
##      Diagnosis  n   percent
##        bipolar 47 0.4895833
##         normal 37 0.3854167
##  schizophrenia 12 0.1250000
comps <- t(combn(1:k, 2))
comps <- split(comps, seq(nrow(comps)))

for (cellType in rownames(counts.xCell.filtered)) {
  MEscore.df <- data.frame("score" = counts.xCell[cellType,],
                         "cluster" = factor(myclusters),
                         row.names = names(myclusters))
  
  print(
    ggplot(MEscore.df, aes(x=cluster, y=score, fill=cluster)) +
    geom_violin(draw_quantiles = 0.5) +
    labs(x="", y="Score") + stat_compare_means(method = "wilcox.test", comparisons = comps, label = "p.signif") +
      ggtitle(paste0("Cluster comparison of ", cellType))
  )
}
## Warning in wilcox.test.default(c(6.22473907890318e-20, 0,
## 3.69228672115654e-19, : cannot compute exact p-value with ties
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in wilcox.test.default(c(0.0447800780643638, 0.0361506684239787, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(c(0.060853369038637, 0.0620371133438586, : cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(c(0, 0.000953708399021774, 0,
## 0.000356038440947778, : cannot compute exact p-value with ties

## Warning in wilcox.test.default(c(0.0372369819894889, 0.0296787087706935, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(c(0.362334967151523, 0.250261268641754,
## 0.216767980997323, : cannot compute exact p-value with ties

## Warning in wilcox.test.default(c(0.0378188874232337, 0.0363499732698856, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(c(0.0666786498605753, 0.0528774333428992, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(c(2.08066611023593e-18, 0.0237624883504869, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(c(2.08066611023593e-18, 0.0237624883504869, :
## collapsing to unique 'x' values

## Warning in wilcox.test.default(c(1.18458162073551e-17, 0.0699227701494967, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(c(0.080287859217708, 0.0726135021758999, : cannot
## compute exact p-value with ties

chosenCells = c("Th1 cells", "Smooth muscle", "MSC")
scores.df <- data.frame("cluster" = factor(myclusters),
                         row.names = names(myclusters))
for (i in seq_along(chosenCells)) {
  scores.df[[paste0("score", i)]] <- counts.xCell[chosenCells[i],]
}

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
ggplot(scores.df, aes(x=score1, y=score2, color=cluster)) + geom_point()

plot_ly(x=scores.df$score1, y=scores.df$score2, z=scores.df$score3, mode="markers", color=scores.df$cluster)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels